In [1]:
import os
import sys

In [2]:
sys.path.append(os.path.join(os.path.abspath('../..'), 'src'))

In [24]:
import mysql_utils

Get Overall Summary of Report Types


In [25]:
with mysql_utils.curWith("SELECT report_type,COUNT(*) as count FROM camp_doc_lu GROUP BY report_type ORDER BY count DESC") as cur:
    report_counts = mysql_utils.dfDocsFromCursor(cur)

In [26]:
report_counts


Out[26]:
report_type count
0 First Quarter 3742
1 Third Quarter 3459
2 Second Quarter 3415
3 Fourth Quarter 3079
4 Miscellaneous Correspondence 2744
5 None 2660
6 48-Hour 2252
7 Year End Semi-Annual 2178
8 Mid Year Semi-Annual 1995
9 Penalty Assessment Letter 1508
10 Certification of Treasurer 1072
11 Organizational 946
12 Correspondence Returned Undeliverable or Uncla... 756
13 Non-Compliance Letter 708
14 Certification to Close Committee 632
15 Paid Penalty Assessment or Forfeiture 582
16 Penalty Appeal 581
17 Final 572
18 Candidate Designation of Committee Funds 417
19 Certification of Inactive Status 273
20 Certification of Threshold 214
21 Audit Letter 177
22 Political Party Executive Committee Exempt Sal... 176
23 Certification of Return to Active Status 161
24 Independent Expenditure Report 140
25 Independent Expenditure for Registered Committees 136
26 Notice of Termination of Active Status 90
27 Electioneering Communications Report 82
28 Loan Proceeds Statement 70
29 Penalty Waiver Letter 66
30 Independent Expenditure for non-Committees 55
31 Forgiven Loan Statement 31
32 Special 24
33 24-Hour Electioneering Communications 12
34 Pre-Referendum 8
35 Supplemental Final 7
36 Contribution from a Business Account Statement 7
37 Deferred Notice 5
38 Thirty-five-day 2
39 District Attorney Letter 2
40 Pre-Primary 2
41 Annual 2
42 Ten-day 2
43 Certification of Incorporated Political Committee 2
44 Penalty Assessment - 30 Days Aged 1
45 Penalty Resolution Agreement Executed 1
46 Pre-Election 1
47 Notice of Candidacy 1
48 Penalty Appeal Decision 1
49 Pre-Runoff 1

Semi-Annual Reports


In [38]:
# Query
report_types = ('Mid Year Semi-Annual', 'Year End Semi-Annual')
semiann_query = "SELECT * FROM camp_doc_lu WHERE report_type IN (" + \
                ', '.join(["'{}'".format(report_types[i]) for i in range(len(report_types))]) +\
                ")" +\
                "AND report_year=2014"
with mysql_utils.curWith(semiann_query) as cur:
    sadf = mysql_utils.dfDocsFromCursor(cur)

In [39]:
sadf.shape


Out[39]:
(373, 10)

In [40]:
sadf.head()


Out[40]:
id committee report_year report_type amend rec_date start_date end_date image_link data_link
0 80 08TH CONG DIST BLACK CAUCUS PAC 2014 Year End Semi-Annual 0 2015-02-02 2014-01-01 2014-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
1 83 08TH CONG DIST BLACK CAUCUS PAC 2014 Mid Year Semi-Annual 0 2014-07-25 2014-01-01 2014-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
2 94 08TH CONG DIST DEC 2014 Mid Year Semi-Annual 0 2014-07-14 2014-01-01 2014-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
3 285 AJ DAOUD CAMPAIGN 2014 Mid Year Semi-Annual 0 2015-08-10 2014-01-01 2014-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
4 286 AJ DAOUD CAMPAIGN 2014 Year End Semi-Annual 1 2015-08-10 2014-07-01 2014-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...

Quarterly Reports


In [32]:
# Query
report_types = ('First Quarter', 'Second Quarter', 'Third Quarter', 'Fourth Quarter')
quarterly_query = "SELECT * FROM camp_doc_lu WHERE report_type IN (" + \
                ', '.join(["'{}'".format(report_types[i]) for i in range(len(report_types))]) +\
                ")" +\
                "AND report_year=2014"
with mysql_utils.curWith(quarterly_query) as cur:
    qdf = mysql_utils.dfDocsFromCursor(cur)

In [33]:
qdf.shape


Out[33]:
(7124, 10)

In [34]:
qdf.head()


Out[34]:
id committee report_year report_type amend rec_date start_date end_date image_link data_link
0 1 01ST CONG DIST DEC 2014 Fourth Quarter 0 2015-01-06 2014-10-19 2014-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
1 2 01ST CONG DIST DEC 2014 Third Quarter 1 2015-01-06 2014-07-01 2014-10-18 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
2 5 01ST CONG DIST DEC 2014 Third Quarter 0 2014-10-27 2014-07-01 2014-10-18 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
3 6 01ST CONG DIST DEC 2014 Second Quarter 0 2014-07-07 2014-04-01 2014-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
4 7 01ST CONG DIST DEC 2014 First Quarter 0 2014-04-29 2014-01-01 2014-03-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...

In [35]:
# Filter out reports without data links
qdf = qdf[qdf.data_link != '']

In [36]:
qdf.shape


Out[36]:
(3845, 10)

In [37]:
qdf.head()


Out[37]:
id committee report_year report_type amend rec_date start_date end_date image_link data_link
0 1 01ST CONG DIST DEC 2014 Fourth Quarter 0 2015-01-06 2014-10-19 2014-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
1 2 01ST CONG DIST DEC 2014 Third Quarter 1 2015-01-06 2014-07-01 2014-10-18 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
3 6 01ST CONG DIST DEC 2014 Second Quarter 0 2014-07-07 2014-04-01 2014-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
4 7 01ST CONG DIST DEC 2014 First Quarter 0 2014-04-29 2014-01-01 2014-03-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
5 8 01ST CONG DIST REC 2014 Fourth Quarter 0 2015-01-12 2014-10-19 2014-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...

Data from "Data" Links


In [43]:
sub = qdf.sample(25)

In [45]:
from time import sleep
import requests

In [46]:
data_pages = []
for ind in list(sub.index):
    did = sub.ix[ind]
    link = sub.ix[ind]['data_link']
    response = requests.get(link)
    if response.status_code == requests.codes.ok:
        data_pages.append({'did' : did,
                           'html' : response.text})
    sleep(1)

In [52]:
len(data_pages)


Out[52]:
25

In [47]:
import pickle

In [49]:
import utils

In [51]:
with open(os.path.join(utils.get_data_dir(), 'interim', 'data_html_sample.pkl'), 'wb') as f:
    pickle.dump(data_pages, f)

In [53]:
import pandas

In [54]:
data_pages = pandas.DataFrame(data_pages)

In [55]:
from bs4 import BeautifulSoup as bs

In [56]:
data_pages['soup'] = data_pages.html.apply(lambda x: bs(x, 'html.parser'))

In [58]:
indx = data_pages.index

In [59]:
soup = data_pages.ix[indx[0]]['soup']

In [60]:
tables = soup.find_all('table')

In [61]:
len(tables)


Out[61]:
1

In [62]:
table = tables[0]

In [63]:
table.find_all('td')


Out[63]:
[<td><b>Section</b></td>,
 <td><b>Count</b></td>,
 <td><b>Link</b></td>,
 <td align="left" width="45%">All Report Sections Listed Below</td>,
 <td align="center" width="10%"> </td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=129409&amp;TP=ALL">HTML View</a></td>,
 <td align="left" width="45%">Report Cover</td>,
 <td align="center" width="10%"> </td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=129409&amp;TP=CVR">HTML View</a></td>,
 <td align="left" width="45%">Detailed Summary</td>,
 <td align="center" width="10%"> </td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=129409&amp;TP=SUM">HTML View</a></td>,
 <td align="left" width="45%">Detailed Receipts</td>,
 <td align="center" width="10%">294</td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=129409&amp;TP=REC">HTML View</a></td>,
 <td align="left" width="45%">Detailed Expenditures</td>,
 <td align="center" width="10%">101</td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=129409&amp;TP=EXP">HTML View</a></td>,
 <td align="left" width="45%">Loan Proceeds</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Loan Repayments</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Forgiven Loans</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Outstanding Loans</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Debts and Obligations Owed TO the Committee</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Debts and Obligations Owed BY the Committee</td>,
 <td align="center" width="10%">8</td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=129409&amp;TP=DBY">HTML View</a></td>,
 <td align="left" width="45%">Account Transfers Within the Committee</td>,
 <td align="center" width="10%">3</td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=129409&amp;TP=TRN">HTML View</a></td>,
 <td align="left" width="45%">Administrative Support</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>]

In [64]:
data_pages.ix[indx[5]]['soup'].find('table').find_all('td')


Out[64]:
[<td><b>Section</b></td>,
 <td><b>Count</b></td>,
 <td><b>Link</b></td>,
 <td align="left" width="45%">All Report Sections Listed Below</td>,
 <td align="center" width="10%"> </td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=137424&amp;TP=ALL">HTML View</a></td>,
 <td align="left" width="45%">Report Cover</td>,
 <td align="center" width="10%"> </td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=137424&amp;TP=CVR">HTML View</a></td>,
 <td align="left" width="45%">Detailed Summary</td>,
 <td align="center" width="10%"> </td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=137424&amp;TP=SUM">HTML View</a></td>,
 <td align="left" width="45%">Detailed Receipts</td>,
 <td align="center" width="10%">97</td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=137424&amp;TP=REC">HTML View</a></td>,
 <td align="left" width="45%">Detailed Expenditures</td>,
 <td align="center" width="10%">28</td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=137424&amp;TP=EXP">HTML View</a></td>,
 <td align="left" width="45%">Loan Proceeds</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Loan Repayments</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Forgiven Loans</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Outstanding Loans</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Debts and Obligations Owed TO the Committee</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Debts and Obligations Owed BY the Committee</td>,
 <td align="center" width="10%">2</td>,
 <td align="center" width="45%"><a href="cf_report_detail.aspx?RID=137424&amp;TP=DBY">HTML View</a></td>,
 <td align="left" width="45%">Account Transfers Within the Committee</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>,
 <td align="left" width="45%">Administrative Support</td>,
 <td align="center" width="10%">0</td>,
 <td align="center" width="45%"><a>HTML View</a></td>]

In [70]:
def data_from_statementtable(table):
    # Three columns
    data = []
    entries = table.find_all('td')
    header = [e.text for e in entries[:3]]
    for i in range(3,len(entries),3):
        subd = []
        for c,e in zip(header, entries[i:(i+3)]):
            if c != 'Link':
                subd.append(e.text)
            else:
                try:
                    subd.append(e.find('a')['href'])
                except KeyError:
                    subd.append('')
        data.append(subd)
    df = pandas.DataFrame(data, columns=header)
    return(df)

In [71]:
d = data_from_statementtable(table)

In [72]:
d


Out[72]:
Section Count Link
0 All Report Sections Listed Below cf_report_detail.aspx?RID=129409&TP=ALL
1 Report Cover cf_report_detail.aspx?RID=129409&TP=CVR
2 Detailed Summary cf_report_detail.aspx?RID=129409&TP=SUM
3 Detailed Receipts 294 cf_report_detail.aspx?RID=129409&TP=REC
4 Detailed Expenditures 101 cf_report_detail.aspx?RID=129409&TP=EXP
5 Loan Proceeds 0
6 Loan Repayments 0
7 Forgiven Loans 0
8 Outstanding Loans 0
9 Debts and Obligations Owed TO the Committee 0
10 Debts and Obligations Owed BY the Committee 8 cf_report_detail.aspx?RID=129409&TP=DBY
11 Account Transfers Within the Committee 3 cf_report_detail.aspx?RID=129409&TP=TRN
12 Administrative Support 0

In [ ]: